###---###---###---###---###---###---###---###---###---###---
# Kerice Doten-Snitker
# 
# Main script for: The Diffusion of Exclusion: Medieval Expulsions of Jews
# Data management then analyses
#    1) source script to clean settlement timing data
#    2) source script to  clean settlement covariate data
#    3) prep annualized data
#    4) create diffusion measures
#    5) run analyses via compiling Rmd
# 
# Built under R version 4.1.3 (2022-03-10) -- "One Push-Up" 
# Platform: x86_64-apple-darwin17.0 (64-bit)
###---###---###---###---###---###---###---###---###---###---


###---###---###---###---###---###---###---###---###---###---###
### 0. Set up the working environment ----------------------------
###---###---###---###---###---###---###---###---###---###---###

# set locale to preempt UTF-8 issues with the mydata
#Sys.getlocale()
# For Mac
Sys.setlocale(category = "LC_ALL", locale = "en_US.UTF-8")
# for Windows
# Sys.setlocale(category = "LC_ALL", locale = "English_United States.1252")

# add packages with the pacman library, which installs if not installed
library(pacman)
# use the here package for improving replicability
p_load(here)
# for data manipulation
p_load(magrittr, plyr, tidyverse, zoo, reshape2, abind, haven, cubelyr)
# for geographical analyses and calculations
p_load(geosphere, raster, gdistance, rgdal, rgeos, spatstat, ncdf4, maptools, spdep)
p_load(igraph, riverdist)
# for various model types and simulations
p_load(rstan, rstanarm) # for Bayesian regression (Gelman group)
p_load(ggeffects, sjstats)
# for visuals
p_load(corrplot, cowplot, bayesplot, sjPlot, tmap)
# for output
p_load(pander, knitr, stargazer, texreg, gridExtra, kableExtra)

set.seed(1418)
options(scipen=999) # don't use scientific notation

###---###---###---###---###---###---###---###---###---###---
# 1. Add annual data to the R environment ####
###---###---###---###---###---###---###---###---###---###---

# A word of warning before getting started:
# Every time I open and resave a CSV with UTF-8 characters in MS Excel,
# the German characters get messed up, and I have to replace them.
# Don't edit the csv's willy-nilly! And save them as UTF-8 encoded!

source(here::here("Diffusion","Diffusion_prep.R"))
saveRDS(mydata_y, here::here("in_data", "diffusion.rds"))
# or, grab the already-prepped file (if previously run)
# includes 2-year left-padding before any obs where persecution is first year observed
# mydata_spells <- readRDS(here::here("in_data", "diffusion.rds"))
nrow(mydata_spells) # is 75396

# If Jews are gone and reappear in the same year, the data manipulation to go from spells to yearly will have duplicated Place-Year observations. This happens for three places: ID_263: Year=1350; ID_605: Year=1306; ID_984: Year= 1349
# drop the duplicated rows
mydata_spells <- mydata_spells %>%
  distinct(Year, PlaceID, .keep_all = TRUE)

nrow(mydata_spells) # is now 75393

###---###---###---###---###---###---###---###---###---###---
# 2. Add annual dominion data ####
###---###---###---###---###---###---###---###---###---###---

source(here::here("Diffusion","Dominion_prep.R"))
saveRDS(mydata_d, here::here("Diffusion","out_files", "dominion_annual.rds"))
# or, grab the already-prepped file
# mydata_d <- readRDS(here::here("Diffusion","out_files", "dominion_annual.rds"))

# combo of PlaceID and Year will create match on the right covariates; leave off duplicated PlaceName
mydata_spells_dom <- merge(mydata_spells, mydata_d[-1], by.x=c("Year","PlaceID"), by.y=c("Years", "PlaceID"), all.x=TRUE, all.y=FALSE)

# still the same number of rows?
nrow(mydata_spells_dom) # annual obs of settlement + dominion

mydata_spells_dom <- mydata_spells_dom %>%
  group_by(PlaceID) %>% 
  arrange(Year) %>%
  # fill in non-included years so that moving sums are correct
  complete(nesting(PlaceID), Year = seq(min(Year), max(Year), 1L)) %>%
  mutate(
    DomChgTotal = cumsum(ifelse(is.na(DomChg), 0, DomChg))-DomChg, # cumulative prior transitions
    DomChg_sum5 = rollapplyr(DomChg, width=list(-1:-5), sum, # prior 5 year moving sum
                          na.rm=TRUE, fill=0, align="right", partial=TRUE),
    DomChg_sum10 = rollapplyr(DomChg, width=list(-1:-10), sum, # 10 year
                           na.rm=TRUE, fill=0, align="right", partial=TRUE)) %>%
  ungroup() %>%
  filter(!is.na(ObsID_Yr)) %>%
  arrange(Year, PlaceID)

###---###---###---###---###---###---###---###---###---###---
# 3. Add the base (periodized) data ####
###---###---###---###---###---###---###---###---###---###---

source(here::here("Periods_prep.R"))
saveRDS(mydata_b, here::here("in_data", "periods_base.rds"))
# or, grab the already-prepped file
# mydata_periods <- readRDS(here::here("in_data", "periods_base.rds"))

# correct coords for Kempten
mydata_periods$XCOORD[mydata_periods$PlaceName=="Kempten"] <- 10.316667	
mydata_periods$YCOORD[mydata_periods$PlaceName=="Kempten"] <- 47.733333

nrow(mydata_periods) # periodized settlement-period obs
length(unique(mydata_periods$PlaceName)) # unique names
length(unique(mydata_periods$ObsID)) # unique IDs

# combo of PlaceID and Period2 will create match on the right period-based covariates; leave off duplicated PlaceName
mydata_base <- merge(mydata_spells_dom, mydata_periods[-c(1,17:32,41:54,93:95,97:106,109:133)], by.x=c("PlaceID", "Period2"), by.y=c("PlaceID", "Period2"), all.x=TRUE, all.y=FALSE)

# due to coding decisions about periods, sometimes no period data
# so remove rows with no matching covariates
# reorder by unique ID, which will sort by year, too
mydata_base <- mydata_base %>%
  filter(!is.na(XCOORD)) %>% 
  arrange(ObsID_Yr)

mydata_base <- mydata_base %>%
  mutate(Agent = ifelse(Schlthss==1|Amt==1, 1, 0) )
  
nrow(mydata_base) # 75331
length(unique(mydata_base$ObsID_Yr)) # unique settlement-year obs
length(unique(mydata_base$ObsID)) # unique settlement-year obs
length(unique(mydata_base$PlaceName)) # unique names
length(unique(mydata_base$PlaceID)) # unique settlement IDs

###---###---###---###---###---###---###---###---###---###---
# 4. Add annual Städtebunde data to the R environment ####
###---###---###---###---###---###---###---###---###---###---

# entered from Distler, Eva-Marie (2006)
source(here::here("Bunde_prep.R"))
saveRDS(mydata_s3, here::here("in_data", "Distler_Staedtebunde_prepped.rds"))
# grab the already-prepped file
# mydata_bunde <- readRDS(here::here("in_data", "Distler_Staedtebunde_prepped.rds"))

# some duplicates for place-year combo because cities concluded multiple 
# agreements in the same year (usually with different partners), 
# so drop the participants column and then summarize by year and place
# to get total count in a year while removing duplicates

mydata_bunde_prepped <- mydata_bunde %>%
  group_by(Year, PlaceID) %>%
  summarize(BundTot = sum(Bund),
            Bund_peers = paste(Participants, collapse=", ")) %>% # these may have duplicates
  ungroup() %>%
  group_by(PlaceID) %>%
  arrange(Year) %>%
  mutate(BundTotPrev = cumsum(lag(BundTot,1,default=0)))

mydata_bunde_prepped <- mydata_bunde %>%
  dplyr::select(Year, PlaceID, PlaceName, Bund) %>%
  distinct() %>%
  merge(mydata_bunde_prepped, by=c("Year","PlaceID")) %>%
  # ungroup() %>%
  arrange(Year, PlaceID)

mydata <- mydata_bunde_prepped %>%
  dplyr::select(Year, PlaceID, Bund, BundTot, Bund_peers, BundTotPrev) %>%
  merge(x=mydata_base,y=., by=c("Year","PlaceID"), all.x=TRUE, all.y=FALSE) %>%
  arrange(Year, PlaceID)

nrow(mydata)
length(unique(mydata$ObsID_Yr)) # unique settlement-year obs


# merging will fill NAs where no matching Bund data, so replace with 0's

mydata <- mydata %>%
  group_by(PlaceID) %>% 
  arrange(Year) %>%
  # fill in non-included years so that moving sums are correct
  complete(nesting(PlaceID), Year = seq(min(Year), max(Year), 1L)) %>%
  fill(BundTotPrev, .direction = "updown") %>%
  mutate_at(vars(Bund, BundTot, BundTotPrev), funs(replace(.,is.na(.),0))) %>%
  mutate(BundTot_lag1 = lag(BundTot,1)) %>%
  mutate(
    BundEver = ifelse(BundTotPrev>0,1,0),
    Bund_sum5 = rollapplyr(BundTot_lag1, width=5, sum, # prior 5 year moving sum
                          na.rm=TRUE, fill=0, align="right", partial=TRUE),
    Bund_sum10 = rollapplyr(BundTot_lag1, width=10, sum, # 10 year
                           na.rm=TRUE, fill=0, align="right", partial=TRUE)) %>%
  ungroup() %>%
  filter(!is.na(ObsID_Yr)) %>%
  arrange(Year, PlaceID) 

###---###---###---###---###---###---###---###---###---###---
# 5. Add annual pogrom data ####
###---###---###---###---###---###---###---###---###---###---

source(here::here("Diffusion","Pogroms_prep.R"))
saveRDS(mydata_pm, here::here("Diffusion","out_files", "pogroms_annual.rds"))
# or, grab the already-prepped file
# mydata_pm <- readRDS(here::here("Diffusion","out_files", "pogroms_annual.rds"))

mydata <- merge(mydata, mydata_pm[-c(1,3,4)], by.x=c("Year", "PlaceID"), by.y=c("VfgYr", "PlaceID"), all.x=TRUE, all.y=FALSE)

# merging will fill NAs where no matching pogrom data, so replace with 0's

mydata <- mydata %>%
  mutate_at(vars(names(mydata_pm)[-c(1:6)]), funs(replace(.,is.na(.),0))) %>%
  group_by(PlaceID) %>% 
  arrange(Year) %>%
  # fill in non-included years so that moving sums are correct
  complete(nesting(PlaceID), Year = seq(min(Year), max(Year), 1L)) %>%
  mutate(
    # sums as inclusive because conceivably any in the same year happened BEFORE an expulsion
    VfgTotal = cumsum(ifelse(is.na(Vfg), 0, Vfg)), # cumulative, inclusive of year
    Vfg_sum5 = rollapplyr(Vfg, width=5, sum, # inclusive 5 year moving sum
      na.rm=TRUE, fill=0, align="right", partial=TRUE),
    Vfg_sum10 = rollapplyr(Vfg, width=10, sum, # 10 year
      na.rm=TRUE, fill=0, align="right", partial=TRUE)) %>%
  ungroup() %>%
  filter(!is.na(ObsID_Yr)) %>%
  arrange(Year, PlaceID)


###---###---###---###---###---###---###---###---###---###---
# 6. Create year-level binary of expulsion observation ####
###---###---###---###---###---###---###---###---###---###---

mydata$Vrtrb <- rep(0, nrow(mydata))
mydata$Vrtrb[mydata$VrtrbType1==1&mydata$VrtrbYr1==mydata$Year] <- 1
mydata$Vrtrb[mydata$VrtrbType2==1&mydata$VrtrbYr2==mydata$Year] <- 1

#table(mydata_b$VrtrbObs)
# one location of 3 expulsions in a period 
# Konstanz: 1432 decree, with 1433 repetition, neither carried out
# plus 1448 - only recording 1432 and 1448, as 1433 was repetition

# consider whether attempts might prompt successful expulsions elsewhere
# recode to include attempted expulsions
mydata$Vrtrb2 <- rep(0, nrow(mydata))
mydata$Vrtrb2[mydata$VrtrbType1==1&mydata$VrtrbYr1==mydata$Year] <- 1
mydata$Vrtrb2[mydata$VrtrbType2==1&mydata$VrtrbYr2==mydata$Year] <- 1
mydata$Vrtrb2[mydata$VrtrbType1==9&mydata$VrtrbYr1==mydata$Year] <- 1
mydata$Vrtrb2[mydata$VrtrbType2==9&mydata$VrtrbYr2==mydata$Year] <- 1

###---###---###---###---###---###---###---###---###---###---
# 7. Further recoding ####
###---###---###---###---###---###---###---###---###---###---

# Make Sector a factor because we don't care what the value is and will only ever use it as a grouping variable
mydata$Sector_f <- factor(mydata$Sector, levels=c(sort(unique(mydata$Sector))), labels=c(sort(unique(mydata$Sector))))

# Having an ID field that isn't numeric lets us use those IDs as row and column names, which we will need when making the distance matrix and then using a function to look up distances by ID in the distance matrix
mydata <- mydata %>%
  mutate(newID = paste("ID", PlaceID, sep="_")) %>% # add field used in climate data
  arrange(PlaceID, Period2)

# add in a few more manipulations
mydata <- mydata %>%
  group_by(PlaceID) %>%
  arrange(Year) %>%
  mutate(
    DomChgTotal = cumsum(ifelse(is.na(DomChg), 0, DomChg))-DomChg,
    VfgTotal = cumsum(ifelse(is.na(Vfg), 0, Vfg))) %>%
  ungroup()

saveRDS(mydata, here::here("in_data", "diffusion_prepped.rds"))
# or, grab the already-prepped file
# mydata <- readRDS(here::here("in_data", "diffusion_prepped.rds"))

###---###---###---###---###---###---###---###---###---###---
# 8. Locations ####
###---###---###---###---###---###---###---###---###---###---

# The original maps in Geschicte der Juden do not specify a map projection. My guess is that the original maps are either ETRS 1989 or Gauß-Krüger, given that these would have been standard in Germany at the time. When I georectified, I used WGS84 (EPSG:4326). The prepped file with all the covariates includes the WGS84 lat-long coordinates. WGS84 worked well for visualization, but in this analysis I need a projection that will be the most accurate for measuring distances between cities and whose calculations will be in meters, a more meaningful unit than degrees. 
summary(mydata$XCOORD) # min: 3.28 max: 11.37
summary(mydata$YCOORD) # min: 44.28 max: 53.61
# Judging by longitudes (XCOORD), the extent of the study area includes UTM Zones 31 and 31. Judging by latitudes (YCOORD), the extent of the study area includes UTM bands T and U. Using a UTM projection would be onerous, and it would still have bias issues. The best projection will be a Lambert azimuthal equal-area projection. I use the projection EPSG:3035. I reprojected the shapefile in QGIS. Now, add in the LAEA coordinates and drop the WGS84 coordinates. 
# potentially useful: the Proj.4 string for the projection I chose
crs_EPSG3035 <- "+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs"
# or use the same projection but with the centroid of this study's area
# crs_laea <- "+proj=laea +lat_0=48.91524085 +lon_0=7.47070312 +x_0=4135598.48 +y_0=2870072.51 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs"
# and here's the Proj.4 for WGS84
crs_WGS84 <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"

# I need to replace the WGS84 coordinates with EPSG:3035, so I'll get the reprojected coordinates from the shapefile, merge them in, and drop the old ones.

city_coords_meters_sp <- readOGR(here::here("GIS_data","Vectors", "GderJuden_cleaned"), "GdJ_cleaned_nocov_meters", GDAL1_integer64_policy = TRUE, stringsAsFactors=FALSE)

city_coords_meters <- city_coords_meters_sp %>%
  data.frame() %>% # extract all the data
  dplyr::select(PlaceID, coords.x1, coords.x2) %>% # drop the WGS84 coordinate columns
  rename(XCOORD=coords.x1, YCOORD=coords.x2) %>% # rename the EPSG3035 columns as x and y
  mutate(newID = paste("ID", PlaceID, sep="_")) # make an ID field that isn't numeric

mydata <- mydata %>%
  dplyr::select(-XCOORD, -YCOORD) %>% # drop the WGS84 coordinate columns
  left_join(city_coords_meters, by=c("PlaceID", "newID")) %>%
  arrange(Year, PlaceID)
  
###---###---###---###---###---###---###---###---###---###---
### 9. Temporal diffusion: count of x=1 in period t-1 ------------
###---###---###---###---###---###---###---###---###---###---

# same as Hedström (1994), Andrews and Biggs (2006), and Braun (2011)

# Make a matrix by year of annual and cumulative totals
# and list-column of IDs where Expl=1 for each year, for lookup
# NA filled where there was no prev value to lag (Year=1000)
# and when the rolling sum included an NA input

# per Yuan, think about weighting by year!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Expl.matrix <- mydata %>%
  dplyr::select(Year, newID, Vrtrb) %>%
  group_by(Year) %>% 
  summarize(Expl_ann = sum(Vrtrb)) %>% # total in a year
  # before lag, make sure all years included
  complete(Year=1000:1520) %>% 
  arrange(Year) %>% # and make sure years are in order
  # and that any added years don't have NAs, so that rolling sums evaluate
  replace_na(list(Expl_ann = 0)) %>% 
  mutate(Expl_lag1=lag(Expl_ann, n=1, default = NA), # lag by 1 year
    Expl_sum5 = rollapplyr(Expl_ann, width=list(-1:-5), sum, # 5 year moving sum
      na.rm=TRUE, fill=NA, align="right"),
    Expl_sum10 = rollapplyr(Expl_ann, width=list(-1:-10), sum, # 10 year 
      na.rm=TRUE, fill=NA, align="right"),
    Expl_sum20 = rollapplyr(Expl_ann, width=list(-1:-20), sum, # 20 year
      na.rm=TRUE, fill=NA, align="right"),
    Expl_sum30 = rollapplyr(Expl_ann, width=list(-1:-30), sum, # 30 year
      na.rm=TRUE, fill=NA, align="right"))

# Also, with package updates, need to update the part where I make rolling lists of IDs - see witches scripts ----

# for reference, can write explusion table to csv
# write.csv(Expl.matrix,here::here("Diffusion", "out_files", "Expl_matrix.csv"), row.names=TRUE)

# add list columns of the place IDs where Vrtrb=1 in time t and t-1
Expl.matrix <- mydata %>%
  dplyr::select(Year, newID, Vrtrb) %>%
  group_by(Year) %>%
  filter(Vrtrb==1) %>%
  dplyr::select(-Vrtrb) %>%
  nest(., Expl_IDs_0 = c(newID)) %>%
  full_join(Expl.matrix, by="Year") %>% # keep all rows of years, add ID lists
  ungroup() %>%
  arrange(Year) %>%
  mutate(Expl_IDs_1=lag(Expl_IDs_0, n=1, default = list(NULL))) # lag by 1 year

# flatten from tibble to list
Expl_IDs_1 <- vector("list", dim(Expl.matrix)[1])
for (i in 2:dim(Expl.matrix)[1]){
  flat.ids <- Expl.matrix$Expl_IDs_1[i][[1]]
  #flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    Expl_IDs_1[i][[1]] <- Reduce(rbind, flat.ids)
  } else {Expl_IDs_1[i] <- list(NULL)}
}

Expl.matrix$Expl_IDs_1 <- Expl_IDs_1

# saveRDS(Expl.matrix, here::here("Diffusion","out_files", "Expl_matrix.rds"))
# or, grab the already-prepped file
Expl.matrix <- readRDS(here::here("Diffusion","out_files", "Expl_matrix.rds"))


### Same, but for pogroms (Verfolgungen), in case I want to compare
# Make a matrix by year of annual and cumulative totals
# and list-column of IDs where Vfg=1 for each year, for lookup
# NA filled where there was no prev value to lag (Year=1000)
# and when the rolling sum included an NA input
# *Remember*, Vfg is *inclusive*, unlike Vrtrb
Pogr.matrix <- mydata %>%
  dplyr::select(Year, newID, Vfg) %>%
  group_by(Year) %>% 
  summarize(Pogr_ann = sum(Vfg)) %>% # total in a year
  mutate(Pogr_lag1=lag(Pogr_ann, n=1, default = NA), # lag by 1 year
    Pogr_sum5 = rollapplyr(Pogr_ann, width=list(-1:-5), sum, # 5 year moving sum
                                na.rm=TRUE, fill=NA, align="right"),
    Pogr_sum10 = rollapplyr(Pogr_ann, width=list(-1:-10), sum, # 10 year
                                 na.rm=TRUE, fill=NA, align="right"),
    Pogr_sum20 = rollapplyr(Pogr_ann, width=list(-1:-20), sum, # 20 year 
                                 na.rm=TRUE, fill=NA, align="right"),
    Pogr_sum30 = rollapplyr(Pogr_ann, width=list(-1:-30), sum, # 30 year
                                 na.rm=TRUE, fill=NA, align="right"))

# for reference, can write explusion table to csv
write.csv(Pogr.matrix,here::here("Diffusion", "out_files", "Pogr_matrix.csv"), row.names=TRUE)

saveRDS(Pogr.matrix, here::here("Diffusion","out_files", "Pogr_matrix.rds"))
# or, grab the already-prepped file
# Pogr.matrix <- readRDS(here::here("Diffusion","out_files", "Pogr_matrix.rds"))

# merge expulsion calculations into the main data
mydata <- mydata %>%
  left_join(Expl.matrix, by="Year") %>% # keep all rows of x (main df), add y (sums)
  left_join(Pogr.matrix, by="Year") %>% # keep all rows of x (main df), add y (sums)
  arrange(Year, PlaceID)

# saveRDS(mydata, here::here("in_data", "diffusion_DIF1.rds"))
# or, grab the already-prepped file
# mydata <- readRDS(here::here("in_data", "diffusion_DIF1.rds"))

###---###---###---###---###---###---###---###---###---###---
### 10. Segmented diffusion: count of x=1 in period t-1 for group G ------
###---###---###---###---###---###---###---###---###---###---

### Expulsion matrix, but by dominion type:
# Make a matrix by year of annual and cumulative totals and list-column of IDs where Vrtrb=1 for each year, for lookup.
# NA filled where there was no prev value to lag (Year=1000) and when the rolling sum included an NA input.
# Having all possible categories is good for exploration but not for model specification

Expl.matrix.dom <- mydata %>%
  dplyr::select(Year, ObsID_Yr, Vrtrb, imperial, free, KingBin, PrinceBin, LordBin, minorBin, ArchbishopBin, BishopBin, AuthEpisc, AuthRelOnly, AuthNobOnly) %>%
  group_by(Year) %>% 
  summarize(Expl_ann_imperial = sum(Vrtrb[imperial==1]),
            Expl_ann_free = sum(Vrtrb[free==1]),
            Expl_ann_King = sum(Vrtrb[KingBin==1]),
            Expl_ann_Prince = sum(Vrtrb[PrinceBin==1]),
            Expl_ann_Lord = sum(Vrtrb[LordBin==1]),
            Expl_ann_minor = sum(Vrtrb[minorBin==1]),
            Expl_ann_Archbishop = sum(Vrtrb[ArchbishopBin==1]),
            Expl_ann_Bishop = sum(Vrtrb[BishopBin==1]),
            Expl_ann_AuthEpisc = sum(Vrtrb[AuthEpisc==1]),
            Expl_ann_secular = sum(Vrtrb[AuthNobOnly==1]),
            Expl_ann_Church = sum(Vrtrb[AuthRelOnly==1])
            ) %>% # total in a year by dominion type
  # before lag, make sure all years included
  complete(Year=1000:1520) %>% 
  arrange(Year) %>% # and make sure years are in order
  # and that any added years don't have NAs, so that rolling sums evaluate
  replace_na(list(Expl_ann_imperial = 0, Expl_ann_free = 0, Expl_ann_King = 0,
                  Expl_ann_Prince = 0, Expl_ann_Lord = 0, Expl_ann_minor = 0,
                  Expl_ann_Archbishop = 0, Expl_ann_Bishop = 0, Expl_ann_AuthEpisc = 0,
                  Expl_ann_secular = 0, Expl_ann_Church = 0)) %>% 
  mutate(Expl_lag1_imperial=lag(Expl_ann_imperial, n=1, default = NA),
         Expl_lag1_free = lag(Expl_ann_free, n=1, default = NA),
         Expl_lag1_King = lag(Expl_ann_King, n=1, default = NA),
         Expl_lag1_Prince = lag(Expl_ann_Prince, n=1, default = NA),
         Expl_lag1_Lord = lag(Expl_ann_Lord, n=1, default = NA),
         Expl_lag1_minor = lag(Expl_ann_minor, n=1, default = NA),
         Expl_lag1_Archbishop = lag(Expl_ann_Archbishop, n=1, default = NA),
         Expl_lag1_Bishop = lag(Expl_ann_Bishop, n=1, default = NA),
         Expl_lag1_AuthEpisc = lag(Expl_ann_AuthEpisc, n=1, default = NA),
         Expl_lag1_secular = lag(Expl_ann_secular, n=1, default = NA),
         Expl_lag1_Church = lag(Expl_ann_Church, n=1, default = NA)
         ) %>% # lag each by 1 year
  mutate(
      Expl_sum5_imperial = rollapplyr(Expl_ann_imperial, width=list(-1:-5), sum, 
                                na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_free = rollapplyr(Expl_ann_free, width=list(-1:-5), sum, 
                                na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_King = rollapplyr(Expl_ann_King, width=list(-1:-5), sum, 
                                na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_Prince = rollapplyr(Expl_ann_Prince, width=list(-1:-5), sum, 
                                 na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_Lord = rollapplyr(Expl_ann_Lord, width=list(-1:-5), sum, 
                                 na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_minor = rollapplyr(Expl_ann_minor, width=list(-1:-5), sum, 
                                 na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_Archbishop = rollapplyr(Expl_ann_Archbishop, width=list(-1:-5), sum, 
                                 na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_Bishop = rollapplyr(Expl_ann_Bishop, width=list(-1:-5), sum, 
                                 na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_AuthEpisc = rollapplyr(Expl_ann_AuthEpisc, width=list(-1:-5), sum, 
                                    na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_secular = rollapplyr(Expl_ann_secular, width=list(-1:-5), sum, 
                                 na.rm=TRUE, fill=NA, align="right"),
      Expl_sum5_Church = rollapplyr(Expl_ann_Church, width=list(-1:-5), sum, 
                                 na.rm=TRUE, fill=NA, align="right")
      ) %>% # 5 year moving sum
  mutate(Expl_sum10_imperial = rollapplyr(Expl_ann_imperial, width=list(-1:-10), sum, 
                                         na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_free = rollapplyr(Expl_ann_free, width=list(-1:-10), sum, 
                                     na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_King = rollapplyr(Expl_ann_King, width=list(-1:-10), sum, 
                                     na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_Prince = rollapplyr(Expl_ann_Prince, width=list(-1:-10), sum, 
                                       na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_Lord = rollapplyr(Expl_ann_Lord, width=list(-1:-10), sum, 
                                     na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_minor = rollapplyr(Expl_ann_minor, width=list(-1:-10), sum, 
                                      na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_Archbishop = rollapplyr(Expl_ann_Archbishop, width=list(-1:-10), sum, 
                                           na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_Bishop = rollapplyr(Expl_ann_Bishop, width=list(-1:-10), sum, 
                                       na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_AuthEpisc = rollapplyr(Expl_ann_AuthEpisc, width=list(-1:-10), sum, 
                                        na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_secular = rollapplyr(Expl_ann_secular, width=list(-1:-10), sum, 
                                        na.rm=TRUE, fill=NA, align="right"),
         Expl_sum10_Church = rollapplyr(Expl_ann_Church, width=list(-1:-10), sum, 
                                       na.rm=TRUE, fill=NA, align="right")
  ) %>% # 10 year moving sum
  mutate(Expl_sum20_imperial = rollapplyr(Expl_ann_imperial, width=list(-1:-10), sum, 
                                          na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_free = rollapplyr(Expl_ann_free, width=list(-1:-20), sum, 
                                      na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_King = rollapplyr(Expl_ann_King, width=list(-1:-20), sum, 
                                      na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_Prince = rollapplyr(Expl_ann_Prince, width=list(-1:-20), sum, 
                                        na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_Lord = rollapplyr(Expl_ann_Lord, width=list(-1:-20), sum, 
                                      na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_minor = rollapplyr(Expl_ann_minor, width=list(-1:-20), sum, 
                                       na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_Archbishop = rollapplyr(Expl_ann_Archbishop, width=list(-1:-20), sum, 
                                            na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_Bishop = rollapplyr(Expl_ann_Bishop, width=list(-1:-20), sum, 
                                        na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_AuthEpisc = rollapplyr(Expl_ann_AuthEpisc, width=list(-1:-20), sum, 
                                        na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_secular = rollapplyr(Expl_ann_secular, width=list(-1:-20), sum, 
                                         na.rm=TRUE, fill=NA, align="right"),
         Expl_sum20_Church = rollapplyr(Expl_ann_Church, width=list(-1:-20), sum, 
                                        na.rm=TRUE, fill=NA, align="right")
  ) # 20 year moving sum

# add list columns of the place IDs where Vrtrb=1 in time t and t-1
Expl.matrix.dom <- mydata %>%
  dplyr::select(Year, newID, Vrtrb, imperial, free, KingBin, PrinceBin, LordBin, minorBin, ArchbishopBin, BishopBin, AuthEpisc, AuthRelOnly, AuthNobOnly) %>%
  group_by(Year) %>% 
  filter(Vrtrb==1) %>%
  gather(., imperial, free, KingBin, PrinceBin, LordBin, minorBin, ArchbishopBin, BishopBin, AuthEpisc, AuthRelOnly, AuthNobOnly, key=Expl_dom, value=value) %>%
  filter(value==1) %>%
  group_by(Year, Expl_dom) %>% # Now that grouped by Year AND Dom, get group counts
  nest(., Expl_IDs_0 = c(newID)) %>%
  dplyr::select(-Vrtrb, -value) %>%
  ungroup() %>%
  # Values are for year 0; before merging they need to be lagged 1 year
  mutate(Year=Year+1) %>%
  pivot_wider(names_from=Expl_dom, values_from=Expl_IDs_0, names_prefix="Expl_IDs_1_") %>%
  full_join(x=Expl.matrix.dom, y=., by="Year") %>% # keep all rows of years, add ID lists
  ungroup() %>% # just to make sure...
  arrange(Year)

# flatten from tibble to list
# vector of which columns to change
dom_ID_vars <- c("Expl_IDs_1_imperial", "Expl_IDs_1_free", 
                 "Expl_IDs_1_KingBin", "Expl_IDs_1_PrinceBin", 
                 "Expl_IDs_1_LordBin", "Expl_IDs_1_minorBin", 
                 "Expl_IDs_1_ArchbishopBin", "Expl_IDs_1_BishopBin", 
                 "Expl_IDs_1_AuthEpisc", 
                 "Expl_IDs_1_AuthRelOnly", "Expl_IDs_1_AuthNobOnly")

Expl.matrix.dom2 <- Expl.matrix.dom %>%
  dplyr::select(c("Year", dom_ID_vars))

for (i in 2:dim(Expl.matrix.dom2)[2]){ # for each dom variable after Year
  for (j in 1:dim(Expl.matrix.dom2)[1]){ # for each row of the variable
    flat.ids <- Expl.matrix.dom2[j,i][[1]] # grab id list
    flat.ids <- flatten(flat.ids) # reduce from tbl to list
    if(!is.null(unlist(flat.ids))){ # if not null:
      Expl.matrix.dom2[j,i][[1]] <- flat.ids}
    else {Expl.matrix.dom2[j,i][[1]] <- list(NULL)}
  }
}

Expl.matrix.dom <- Expl.matrix.dom %>%
  dplyr::select(-c(dom_ID_vars)) %>%
  join(Expl.matrix.dom2, by="Year")

# for reference, can write explusion table to csv; csv can't handle list columns
write.csv(Expl.matrix.dom[,1:41],here::here("Diffusion", "out_files", "Expl_matrix_dom.csv"), row.names=TRUE)

saveRDS(Expl.matrix.dom, here::here("Diffusion","out_files", "Expl_matrix_dom.rds"))
# or, grab the already-prepped file
Expl.matrix.dom <- readRDS(here::here("Diffusion","out_files", "Expl_matrix_dom.rds"))

## If status group is being an alliance member or non-members?
# I cut this out because it was difficult to make sense of it, with lagging both Bund membership and then expulsions based on (lagged) Bund membership. If I look at prior expulsions among cities that were in Bunds, then Bund membership gets double-lagged and is 1 year off.
# I should run some proportions - e.g., % of cities with prior Bund membership that expelled, % of cities without prior Bund membership that expelled.

# sum over Expl.matrix.pol where domparty==1

# Adjust where variables do not match the observation's dominion type
# (eg, Expl calculations for an imperial city should be NA for non-imperial)
# and then calculate within-group values!

mydata <- mydata %>%
  left_join(Expl.matrix.dom, by="Year") %>% # keep all rows of x (main df), add y (sums)
  arrange(Year, PlaceID) %>%
  mutate_at(.vars=vars(c(Expl_ann_imperial, Expl_lag1_imperial, Expl_sum5_imperial, Expl_sum10_imperial, Expl_sum20_imperial)), 
            .funs=~ifelse(imperial==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_free, Expl_lag1_free, Expl_sum5_free, Expl_sum10_free, Expl_sum20_free)), 
            .funs=~ifelse(free==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_King, Expl_lag1_King, Expl_sum5_King, Expl_sum10_King, Expl_sum20_King)), 
            .funs=~ifelse(KingBin==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_Prince, Expl_lag1_Prince, Expl_sum5_Prince, Expl_sum10_Prince, Expl_sum20_Prince)), 
            .funs=~ifelse(PrinceBin==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_Lord, Expl_lag1_Lord, Expl_sum5_Lord, Expl_sum10_Lord, Expl_sum20_Lord)), 
            .funs=~ifelse(LordBin==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_minor, Expl_lag1_minor, Expl_sum5_minor, Expl_sum10_minor, Expl_sum20_minor)), 
            .funs=~ifelse(minorBin==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_Archbishop, Expl_lag1_Archbishop, Expl_sum5_Archbishop, Expl_sum10_Archbishop, Expl_sum20_Archbishop)), 
            .funs=~ifelse(ArchbishopBin==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_Bishop, Expl_lag1_Bishop, Expl_sum5_Bishop, Expl_sum10_Bishop, Expl_sum20_Bishop)), 
            .funs=~ifelse(BishopBin==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_AuthEpisc, Expl_lag1_AuthEpisc, Expl_sum5_AuthEpisc, Expl_sum10_AuthEpisc, Expl_sum20_AuthEpisc)), 
            .funs=~ifelse(AuthEpisc==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_secular, Expl_lag1_secular, Expl_sum5_secular, Expl_sum10_secular, Expl_sum20_secular)), 
            .funs=~ifelse(AuthNobOnly==1, ., NA)) %>%
  mutate_at(.vars=vars(c(Expl_ann_Church, Expl_lag1_Church, Expl_sum5_Church, Expl_sum10_Church, Expl_sum20_Church)), 
            .funs=~ifelse(AuthRelOnly==1, ., NA))

saveRDS(mydata, here::here("in_data", "diffusion_groups.rds"))
# or, grab the already-prepped file
mydata <- readRDS(here::here("in_data", "diffusion_groups.rds"))

###---###---###---###---###---###---###---###---###---###---
# 11. Spatial diffusion: distance-scaled count of x=1 in period t-1 ####
###---###---###---###---###---###---###---###---###---###---

# make newIDs vector to be used for indexing a distance matrix
# put in order by PlaceID because by newID goes 1,10,100,2,20,200
# (alphabetical order, because it's character and not numeric)
ID_list <- mydata %>%
  dplyr::select(PlaceID, newID) %>%
  distinct(newID, .keep_all = TRUE) %>%
  arrange(PlaceID) %>% 
  dplyr::select(-PlaceID)

# separate the settlement coordinates with expulsions
# include attempted expulsions (Vrtrb2==1), in case I decide later to include those
targets <- mydata %>%
  filter(Vrtrb2==1) %>% 
  arrange(PlaceID) %>%
  dplyr::select(newID, XCOORD, YCOORD) %>%
  distinct(newID, .keep_all=TRUE)
# don't run: write to csv for NetLogo or other GIS option for path calculation
#write.csv(targets,here::here("Diffusion","out_files", "targets_coords_meters.csv"), row.names=FALSE)

# make object storing coordinates (EPSG:3035 - meters)
points_coords_meters <- city_coords_meters %>%
  arrange(PlaceID) %>%
  dplyr::select(newID, XCOORD, YCOORD) %>%
  column_to_rownames(var="newID")

# don't run: write to csv for NetLogo or other GIS option for path calculation
#write.csv(points_coords_meters, here::here("Diffusion", "out_files", "points_coords_meters.csv"), row.names=TRUE)
# points_coords_meters <- read.csv(here::here("Diffusion", "out_files", "points_coords_meters.csv"), header=TRUE, row.names=1, stringsAsFactors=FALSE)

### 11.1. Ruggedness ----
# An alternative to distance-based diffusion

# Add study area digital elevation model (DEM) geotiff
# Original download is WGS84; use the raster reprojected to EPSG3035
dem <- raster(here::here("GIS_data","Rasters","SRTMs_Study_Area_meters.tif"))
  
# When missing, uses -3.4e+38 (-32768/32768), so remove those -
# add NAs in DEM where outside land mass (show as extreme low values)
# modern lowest point in  study area is -7m
dem[dem<(-7)] <- NA
# now min/max is -7, 4783

# use the terrain function to calculate ruggedness by comparing each raster cell to its 8 neighbors (includes adjacent as sharing an edge and as sharing a corner)
dem_rugg <- terrain(dem, opt='TRI', neighbors=8)
# then extract ruggedness values at each city's coordinates
points_geo <- points_coords_meters %>%
  rownames_to_column("newID")
points_geo$rugged <- raster::extract(dem_rugg, points_coords_meters, method='simple')

write.csv(points_geo,here::here("Diffusion", "out_files", "points_meters_geo.csv"), row.names=FALSE)
# or, grab the already-prepped file
points_geo <- read.csv(here::here("Diffusion", "out_files", "points_meters_geo.csv"), header=TRUE, encoding = "UTF-8", stringsAsFactors=FALSE)

mydata_terr <- mydata %>%
  left_join(., points_geo[,c(1,4)], by=c("newID")) %>%
  arrange(Year, ObsID_Yr)

saveRDS(mydata_terr, here::here("in_data", "diffusion_terr.rds"))
# or, grab the alreaddy-prepped file
mydata_terr <- readRDS(here::here("in_data", "diffusion_terr.rds"))

### 11.2. Euclidean distances between settlements ----

# Create a distance matrix with the diagonal and upper half filled in - will need all values in order to do any functions with the matrix
dist_euc_matrix <- points_coords_meters %>%
  dist(., diag = TRUE, upper = TRUE)/1000 # distances in meters, so divide for km
dist_euc_matrix <- as.data.frame(as.matrix(dist_euc_matrix))

write.csv(dist_euc_matrix,here::here("Diffusion", "out_files", "distances_meters_euc.csv"), row.names=TRUE)
dist_euc_matrix <- read.table(here::here("Diffusion", "out_files", "distances_meters_euc.csv"), 
  row.names=1, header=TRUE, sep= ",", encoding = "UTF-8", stringsAsFactors=FALSE)

### 11.3. Route-based distances between settlements: roads + rivers ----

# See "Routing.R" script

# calculate distances btwn settlements and locations snapped to roads

cities <- readOGR(here::here("GIS_data","Vectors", "GderJuden_cleaned"), "GdJ_cleaned_nocov_meters", GDAL1_integer64_policy = TRUE, stringsAsFactors=FALSE)

setts_on <- readOGR(here::here("GIS_data","Vectors", "GderJuden_cleaned"), "GdJ_cleaned_snapped_road100km", GDAL1_integer64_policy = TRUE)

setts_on$XCOORD <- cities$coords.x1
setts_on$YCOORD <- cities$coords.x2
setts_on_df <- data.frame(setts_on)

dist2snap_roads <- pointDistance(setts_on_df[,c(3,4)], # the original coords
                                 setts_on_df[,c(5,6)], 
                                 lonlat=FALSE, allpairs=FALSE)/1000 # the snapped coords; distances in meters, so divide for km

# make new character (not numeric) IDs to be used for indexing
setts_on_df <- setts_on_df %>%
  mutate(newID = paste("ID", PlaceID, sep="_")) %>% # add ID field for dist matrix
  arrange(PlaceID)
dist2snap_roads <- as.data.frame(dist2snap_roads)
rownames(dist2snap_roads) <- setts_on_df$newID

# also, add distance to a road as a variable...
mydata_terr <- dist2snap_roads %>%
  rownames_to_column(var="newID") %>%
  left_join(x=mydata_terr, y=., by="newID") %>% # keep all rows of x (main df), add y (sums)
  arrange(Year, PlaceID)

# majority of settlements are nearby BW routes 
# but some are visibly along alternatives: rivers!
# river shapefiles from Natural Earth II

# use read.table instead of read.csv in order to specify row names 
dist_rivroad <- read.table(here::here("Diffusion","out_files", "distances_mainriv01road_meters.csv"), row.names=1, header=TRUE, sep= ",", encoding = "UTF-8", stringsAsFactors=FALSE)

dist_rivroad <- dist_rivroad/1000 # transform from meters to kilometers

# calc Euclidean distances btwn settlements and locations snapped to roads

setts_on_rr <- readOGR(here::here("GIS_data","Vectors", "GderJuden_cleaned"), "GdJ_cleaned_snapped_mainriv01road100km", GDAL1_integer64_policy = TRUE)

setts_on_rr$XCOORD <- cities$coords.x1
setts_on_rr$YCOORD <- cities$coords.x2
setts_on_rr_df <- data.frame(setts_on_rr)

dist2snap_rivroad <- pointDistance(setts_on_rr_df[,c(3,4)], # the original coords
                                   setts_on_rr_df[,c(5,6)], 
                                   lonlat=FALSE, allpairs=FALSE)/1000 # the snapped coords; distances in meters, so divide for km
dist2snap_rivroad <- as.data.frame(dist2snap_rivroad)


# make new character (not numeric) IDs to be used for indexing
setts_on_rr_df <- setts_on_rr_df %>%
  mutate(newID = paste("ID", PlaceID, sep="_")) %>% # add ID field for dist matrix
  arrange(PlaceID)

rownames(dist2snap_rivroad) <- setts_on_rr_df$newID
rownames(dist_rivroad) <- setts_on_rr_df$newID
colnames(dist_rivroad) <- setts_on_rr_df$newID

# add snapping distances
dist_rivroad_sp <- sweep(dist_rivroad, 2, dist2snap_rivroad[,1], "+") # rows
dist_rivroad_sp <- sweep(dist_rivroad_sp, 1, dist2snap_rivroad[,1], "+") # columns
diag(dist_rivroad_sp) <- 0 # reset the diagonal to 0

# also, add distance to a road/river as a variable
mydata_terr <- dist2snap_rivroad %>%
  rownames_to_column(var="newID") %>%
  left_join(x=mydata_terr, y=., by="newID") %>% # keep all rows of x (main df), add y (sums)
  arrange(Year, PlaceID)

# And add distance to Constance for each city
unique(mydata_terr$PlaceID[mydata_terr$PlaceName=="Konstanz"]) # 980

mydata_terr <- dist_rivroad_sp %>% # With the distance matrix, 
  rownames_to_column(var = "newID") %>% # make sure IDs are a column, for merging,
  dplyr::select(newID, ID_980) %>% # Keep only the Constance column
  left_join(x=mydata_terr, y=., by="newID") %>% # keep all rows of x (main df), add y (distances to Constance), as one-to-many match
  arrange(Year, PlaceID)

names(mydata_terr)[names(mydata_terr) == "ID_980"] <- "dist_Constance"

saveRDS(mydata_terr, here::here("in_data", "diffusion_terr.rds"))
# or, grab the alreaddy-prepped file
mydata_terr <- readRDS(here::here("in_data", "diffusion_terr.rds"))

### 11.4. Adjust temporal diffusion for Euclidean distances ----

# Again, from Hedström's spatial diffusion
# scaled by sqrts of distances to x=1:
# produces higher values when more and close expulsions,
# lower values when fewer or farther expulsions

# create columns for geo diff values
mydata_geo <- mydata_terr %>%
  mutate(dist1_euc = rep(NA, dim(.)[1]))

# calculate geo diff with previous period = 1 year
# currently fills NULLs with 0 instead of NA so that cases don't get dropped
# but for 1000-1009 should really be NA 
# (only a few cases, though, which get dropped anyway because 10 year lag is NA)

for (i in 1: dim(mydata_geo)[1]){ # iterating by rows
  # if prior expulsions for t=1
  if(!is.null(mydata_geo$Expl_IDs_1[[i]][[1]])){
    # collect the distances subset for the alters for city i
    dist.subs <- dist_euc_matrix[mydata_geo$newID[i], 
                                 mydata_geo$Expl_IDs_1[[i]]]
    # exclude Expl in same city in prior period
    dist.subs[dist.subs==0] <- NA 
    # sum all n(Expl)/sqrt(dist to Expl j)
    mydata_geo$dist1_euc[i] <- sum(mydata_geo$Expl_lag1[i]/sqrt(dist.subs), 
                                   na.rm=TRUE)
  } else {mydata_geo$dist1_euc[i] <- 0} # if no prior for t=1, fill 0
}

diff.euc.matrix <- mydata_geo %>%
  dplyr::select(Year, PlaceID, ObsID_Yr, dist1_euc) %>%
  group_by(PlaceID) %>% 
  arrange(Year) %>%
  complete(nesting(PlaceID), Year = seq(min(Year), max(Year), 1L)) %>%
  mutate(dist5_euc = rollapplyr(dist1_euc, 5, sum, # 5 year moving sum
    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  mutate(dist10_euc = rollapplyr(dist1_euc, 10, sum, # 10 year moving sum
    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  mutate(dist20_euc = rollapplyr(dist1_euc, 20, sum, # 20 year moving sum
    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  mutate(dist30_euc = rollapplyr(dist1_euc, 30, sum, # 30 year moving sum
    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  ungroup() %>%
  filter(!is.na(ObsID_Yr)) %>%
  arrange(Year, PlaceID)

# If multiple expulsions in the same settlement in a year (V RARE), the distances aren't counted multiple times. However, it makes sense to think that an expulsion in city j of distance m would not have much different impact than two expulsions in city j of distance m.

### 11.5. Adjust temporal diffusion for route-based distance ----

mydata_geo <- mydata_geo %>%
  mutate(dist1_rivroad = rep(NA, dim(.)[1]))

# using roads+rivers
for (i in 1: dim(mydata_geo)[1]){
  if(!is.null(mydata_geo$Expl_IDs_1[[i]][[1]])){
    dist.subs <- dist_rivroad_sp[mydata_geo$newID[i], mydata_geo$Expl_IDs_1[[i]]]
    dist.subs[dist.subs==0] <- NA # some Expl in same city in prior period
    mydata_geo$dist1_rivroad[i] <- sum(mydata_geo$Expl_lag1[i]/sqrt(dist.subs), 
      na.rm=TRUE)
  } else {mydata_geo$dist1_rivroad[i] <- 0} 
}

# create moving sums
# use "complete" again to pad city-year observations to make sure moving sums are for the appropriate years

diff.matrix <- mydata_geo %>%
  dplyr::select(Year, PlaceID, ObsID_Yr, 
                dist1_rivroad) %>%
  group_by(PlaceID) %>% 
  arrange(Year) %>%
  complete(nesting(PlaceID), Year = seq(min(Year), max(Year), 1L)) %>%
  mutate(dist5_rivroad = rollapplyr(dist1_rivroad, 5, sum, # 5 year moving sum
    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  mutate(dist10_rivroad = rollapplyr(dist1_rivroad, 10, sum, # 10 year moving sum
    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  mutate(dist20_rivroad = rollapplyr(dist1_rivroad, 20, sum, # 20 year moving sum
    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  mutate(dist30_rivroad = rollapplyr(dist1_rivroad, 30, sum, # 30 year moving sum
    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  ungroup() %>%
  filter(!is.na(ObsID_Yr)) %>%
  arrange(Year, PlaceID)

### 11.6 merge diffusion calculations into the main data ----
# unique ObsID_Yr should be 75331

mydata_geo <- diff.euc.matrix %>%
  dplyr::select(c(1:2,5:7)) %>% # 5,10, 20 yr calculations
  left_join(mydata_geo, ., by=c("Year", "PlaceID")) %>% # keep all x, add y
  arrange(Year, PlaceID)

# merge diffusion calculations into the main data
mydata_geo <- diff.matrix %>%
  dplyr::select(c(1:2,5:7)) %>%
  left_join(mydata_geo, ., by=c("Year", "PlaceID")) %>% # keep all x, add y
  arrange(Year, PlaceID)


saveRDS(mydata_geo, here::here("in_data", "diffusion_geo.rds"))
# or, grab the already-prepped file
mydata_geo <- readRDS(here::here("in_data", "diffusion_geo.rds"))

###---###---###---###---###---###---###---###---###---###---
### 12. Segmented + Spatial Diffusion ------
### distance-scaled count of x=1 in period t-1 for group G
###---###---###---###---###---###---###---###---###---###---

# only include other expulsions in group G,
# scaled by sqrts of distances to x=1:
# produces higher values when more and close expulsions,
# lower values when fewer or farther expulsions

# create columns for dominion group list values
mydata_geo <- mydata_geo %>%
  mutate(DomGrp_peers_10 = vector("list", dim(.)[1]),
         DomGrp_FreeImp_10 = vector("list", dim(.)[1]),
         DomGrp_Princely_10 = vector("list", dim(.)[1]),
         DomGrp_Episc_10 = vector("list", dim(.)[1]))

# 12.1. Subset city alters to cities of type g ----

# Breaking this into separate steps dramatically cuts execution time.

# I am really only interested in the peers of free+imperial, bishop+archbishop, and princely. I already have the IDs of expulsions in these groups lagged 1 year in: Expl_IDs_1_imperial, Expl_IDs_1_free, Expl_IDs_1_PrinceBin, Expl_IDs_1_AuthEpisc
# This script is written to pull all cities of type g in the previous 10 years. Conceivably, a city may not be type g in {t-1}. The one-year lags will include cities that expelled in {t-1} so long as they were type g in {t-10}.

# 1) create temp vector of Dom types for the observation
filter.temp.FreeImp <- c("free", "imperial")
filter.temp.Prince <- c("PrinceBin")
filter.temp.Episc <- c("ArchbishopBin", "BishopBin")

df.temp.FreeImp <- mydata_geo %>%
  dplyr::select(Year, newID, Vrtrb, filter.temp.FreeImp, DomGrp_FreeImp_10) %>% # make it manageable
  filter(rowSums(dplyr::select(., filter.temp.FreeImp)) >= 1) # drop where no types ==1
t.start <- Sys.time()
for (i in 1: dim(df.temp.FreeImp)[1]){
  # 2) create temp vector that is that observation's year
  year.temp <- df.temp.FreeImp$Year[i] 
  # 3) feed row-specific temp vectors to get list of peers' IDs for prior 10 yrs
  # Autonomous/Semi-autonomous (free cities and imperial cities)
  DomGrp.FreeImp.10 <- df.temp.FreeImp %>%
    filter(Year>(year.temp-10) & Year<year.temp) %>% # only cities in period
    filter(Vrtrb == 1) %>% # only that expelled while that type
    dplyr::select(newID) %>% # IDs only
    as.list() # as list to store in list column 
  # (NOTE: empty are not NULL; they are character strings 0L)
  # 4) push list of peer IDs into list column in df
  df.temp.FreeImp$DomGrp_FreeImp_10[i][[1]] <- DomGrp.FreeImp.10
}
Sys.time()-t.start # just over 1 min

t.start <- Sys.time()
# 1: Subset to cities of type g
df.temp.Prince <- mydata_geo %>%
  dplyr::select(Year, newID, Vrtrb, filter.temp.Prince, DomGrp_Princely_10) %>% # make it manageable
  filter(rowSums(dplyr::select(., filter.temp.Prince)) >= 1) # drop where no types ==1
for (i in 1: dim(df.temp.Prince)[1]){
  # 2) create temp vector that is that observation's year
  year.temp <- df.temp.Prince$Year[i] 
  # 3) feed row-specific temp vectors to get list of peers' IDs for prior 10 yrs
    # Princely
  DomGrp.Prince.10 <- df.temp.Prince %>%
    filter(Year>(year.temp-10) & Year<year.temp) %>% # only cities in period
    filter(Vrtrb == 1) %>% # only that expelled while that type
    dplyr::select(newID) %>% # IDs only
    as.list() # as list to store in list column (NOTE: empty are not NULL)
  # 4) push list of peer IDs into list column in df
  df.temp.Prince$DomGrp_Princely_10[i][[1]] <- DomGrp.Prince.10
}
Sys.time()-t.start # about 7 minutes

t.start <- Sys.time()
# 1: Subset to cities of type g
df.temp.Episc <- mydata_geo %>%
  dplyr::select(Year, newID, Vrtrb, filter.temp.Episc, DomGrp_Episc_10) %>% # make it manageable
  filter(rowSums(dplyr::select(., filter.temp.Episc)) >= 1) # drop where no types ==1
for (i in 1: dim(df.temp.Episc)[1]){
  # 2) create temp vector that is that observation's year
  year.temp <- df.temp.Episc$Year[i] 
  # 3) feed row-specific temp vectors to get list of peers' IDs for prior 10 yrs
  # Episcopal
  DomGrp.Episc.10 <- df.temp.Episc %>%
    dplyr::select(Year, newID, Vrtrb, filter.temp.Episc) %>% # make it manageable
    filter(rowSums(dplyr::select(., filter.temp.Episc)) >= 1) %>% # drop where no types ==1
    filter(Year>(year.temp-10) & Year<year.temp) %>% # only cities in period
    filter(Vrtrb == 1) %>% # only that expelled while that type
    dplyr::select(newID) %>% # IDs only
    as.list() # as list to store in list column (NOTE: empty are not NULL)
  # 4) push list of peer IDs into list column in df
  df.temp.Episc$DomGrp_Episc_10[i][[1]] <- DomGrp.Episc.10
}
Sys.time()-t.start # about 4 minutes

# for comparison: all peers of any type in past 10 years
# (this takes much longer because one can't subset outside the loop and feed in a smaller dataframe)
city.types <- c("imperial", "free", "KingBin", "PrinceBin", "LordBin", "minorBin", "ArchbishopBin", "BishopBin")
for (i in 1: dim(mydata_geo)[1]){
  # 1) create temp vector of Dom types for the observation
  filter.temp <- mydata_geo[i,] %>%
    dplyr::select(city.types) %>% # only relevant Dom variables
    dplyr::select(which(colSums(.) == 1)) %>% # only where condition is 1
    names(.) # get col names in vector
  # 2) create temp vector that is that observation's year
  year.temp <- mydata_geo$Year[i] 
  # 3) feed row-specific temp vectors to get list of peers' IDs for prior 10 yrs
  DomGrp.peers.10 <- mydata_geo %>%
    dplyr::select(Year, newID, Vrtrb, filter.temp) %>% # make it manageable
    filter(rowSums(dplyr::select(., filter.temp)) >= 1) %>% # drop where no types ==1
    filter(Year>(year.temp-10) & Year<year.temp) %>% # only cities in period
    filter(Vrtrb == 1) %>% # only that expelled while that type
    dplyr::select(newID) %>% # IDs only
    as.list() # as list to store in list column (NOTE: empty are not NULL)
  # 4) push list of peer IDs into list column in df
  mydata_geo$DomGrp_peers_10[i][[1]] <- DomGrp.peers.10
}

mydata_geo <- mydata_geo %>%
  dplyr::select(-DomGrp_FreeImp_10,-DomGrp_Princely_10, -DomGrp_Episc_10) %>%
  left_join(df.temp.FreeImp[,c("Year", "newID", "DomGrp_FreeImp_10")], 
             by=c("Year", "newID")) %>% # keep all x, add y
  left_join(df.temp.Prince[,c("Year", "newID", "DomGrp_Princely_10")],
             by=c("Year", "newID")) %>% # keep all x, add y
  left_join(df.temp.Episc[,c("Year", "newID", "DomGrp_Episc_10")],
             by=c("Year", "newID")) %>% # keep all x, add y
  arrange(Year, PlaceID)

saveRDS(mydata_geo, here::here("in_data", "diffusion_const.rds"))
# or, grab the already-prepped file
mydata_geo <- readRDS(here::here("in_data", "diffusion_const.rds"))


### 12.2. Filter Expl_IDs_1 to exclude those not in type g ----
# Subset will check each individually and preserve repeated in Expl_IDs_1, not repeated in DomGrp_[peers]_10; this is the desired result.
# Fill with list(NULL) because that's appropriate for a list column and what will be used in the next stage of code to calculate the distance-weighted diffusion measure.

mydata_geo <- mydata_geo %>%
  mutate(Expl_IDs_1_DomGrp = # create new var
           # apply function to each row using Expl_IDs_1 and DomGrp_peers_10:
           # subset items in Expl if in DomGrp
           map2(Expl_IDs_1, DomGrp_peers_10, 
                .f = function(x,y) subset(x, unlist(x) %in% unlist(y)))) %>%
  mutate(Expl_IDs_1_FreeImp = # create new var
    # apply function to each row using Expl_IDs_1 and DomGrp_Free_10:
    # subset items in Expl if in DomGrp
           map2(Expl_IDs_1, DomGrp_FreeImp_10, 
                .f = function(x,y) subset(x, unlist(x) %in% unlist(y))),
    Expl_IDs_1_Princely = map2(Expl_IDs_1, DomGrp_Princely_10, 
           .f = function(x,y) subset(x, unlist(x) %in% unlist(y))),
    Expl_IDs_1_Episc = map2(Expl_IDs_1, DomGrp_Episc_10, 
           .f = function(x,y) subset(x, unlist(x) %in% unlist(y)))) 
  
# The result preseved empty lists, some as character(0) and some as logical(0). Inconsistent types for empty lists will produce problems when iterating through them in the next step. Replace those empty lists with list(NULL).

mydata_geo$Expl_IDs_1_DomGrp[lengths(mydata_geo$Expl_IDs_1_DomGrp) == 0] <- list(NULL)
mydata_geo$Expl_IDs_1_FreeImp[lengths(mydata_geo$Expl_IDs_1_FreeImp) == 0] <- list(NULL)
mydata_geo$Expl_IDs_1_Princely[lengths(mydata_geo$Expl_IDs_1_Princely) == 0] <- list(NULL)
mydata_geo$Expl_IDs_1_Episc[lengths(mydata_geo$Expl_IDs_1_Episc) == 0] <- list(NULL)

### 12.3. The count of lagged expulsions within the dominion group ----
# It is the length of the filtered IDs list
mydata_geo <- mydata_geo %>%
  mutate(Expl_lag1_DomGrp = map_dbl(Expl_IDs_1_DomGrp,
                                    ~length(.x)),
         Expl_lag1_FreeImp = map_dbl(Expl_IDs_1_FreeImp,
                                  ~length(.x)),
         Expl_lag1_Princely = map_dbl(Expl_IDs_1_Princely,
                              ~length(.x)),
         Expl_lag1_Episc = map_dbl(Expl_IDs_1_Episc,
                              ~length(.x)))

#table(mydata_geo$Expl_lag1_DomGrp, mydata_geo$Vrtrb)

# Where no IDs (because no expulsions in that group), length is 0; it will always be 0 for cities that are not in the DomGrp
  
# 12.4. Distance-scaling for group-limited expulsions ----

# create columns for geo diff values
mydata_geo <- mydata_geo %>%
  mutate(const1_DomType = rep(NA, dim(.)[1]),
         const1_FreeImp = rep(NA, dim(.)[1]),
         const1_Princely = rep(NA, dim(.)[1]),
         const1_Episc = rep(NA, dim(.)[1]))

# feed filtered list to dist matrix calculation with previous period = 1 year
# (currently fills NULLs with 0 instead of NA so that cases don't get dropped
# but for 1000-1009 should really be NA -
# only a few cases, though, which get dropped anyway because 10 year lag is NA)
# each sub-group loop takes ~30 seconds

# Any peers
for (i in 1: dim(mydata_geo)[1]){
  if(!is.null(mydata_geo$Expl_IDs_1_DomGrp[[i]][[1]])){
    dist.subs <- dist_rivroad_sp[mydata_geo$newID[i], mydata_geo$Expl_IDs_1_DomGrp[[i]]]
    dist.subs[dist.subs==0] <- NA # some Expl in same city in prior period
    mydata_geo$const1_DomType[i] <- sum(mydata_geo$Expl_lag1_DomGrp[i]/sqrt(dist.subs), 
                                        na.rm=TRUE)
  } else {mydata_geo$const1_DomType[i] <- 0} 
}

# Autonomy-seeking cities
for (i in 1: dim(mydata_geo)[1]){
  if(!is.null(mydata_geo$Expl_IDs_1_FreeImp[[i]][[1]])){
    dist.subs <- dist_rivroad_sp[mydata_geo$newID[i], mydata_geo$Expl_IDs_1_FreeImp[[i]]]
    dist.subs[dist.subs==0] <- NA # some Expl in same city in prior period
    mydata_geo$const1_FreeImp[i] <- 
      sum(mydata_geo$Expl_lag1_FreeImp[i]/sqrt(dist.subs), na.rm=TRUE)
  } else {mydata_geo$const1_FreeImp[i] <- 0} 
}
# Princely cities
for (i in 1: dim(mydata_geo)[1]){
  if(!is.null(mydata_geo$Expl_IDs_1_Princely[[i]][[1]])){
    dist.subs <- dist_rivroad_sp[mydata_geo$newID[i], mydata_geo$Expl_IDs_1_Princely[[i]]]
    dist.subs[dist.subs==0] <- NA # some Expl in same city in prior period
    mydata_geo$const1_Princely[i] <- sum(mydata_geo$Expl_lag1_Princely[i]/sqrt(dist.subs), 
                                        na.rm=TRUE)
  } else {mydata_geo$const1_Princely[i] <- 0} 
}
# Episcopal cities
for (i in 1: dim(mydata_geo)[1]){
  if(!is.null(mydata_geo$Expl_IDs_1_Episc[[i]][[1]])){
    dist.subs <- dist_rivroad_sp[mydata_geo$newID[i], mydata_geo$Expl_IDs_1_Episc[[i]]]
    dist.subs[dist.subs==0] <- NA # some Expl in same city in prior period
    mydata_geo$const1_Episc[i] <- sum(mydata_geo$Expl_lag1_Episc[i]/sqrt(dist.subs), 
                                        na.rm=TRUE)
  } else {mydata_geo$const1_Episc[i] <- 0} 
}

# each is very non-linear (like the non-segmented measure), but can't log them because many values are 0 and log(0)=-Inf
summary(mydata_geo$const1_DomType)
summary(mydata_geo$const1_FreeImp)
summary(mydata_geo$const1_Princely)
summary(mydata_geo$const1_Episc)

# problem: if multiple expulsions in the same settlement, the distances aren't counted multiple times - underestimating

# Fill in the values for different lags
mydata_geo <- mydata_geo %>%
  group_by(PlaceID) %>% 
  arrange(Year) %>%
  complete(nesting(PlaceID), Year = seq(min(Year), max(Year), 1L)) %>%
  mutate(Expl_sum5_DomGrp =
           rollapplyr(Expl_lag1_DomGrp, 5, sum, # 5 year moving sum
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum10_DomGrp =
           rollapplyr(Expl_lag1_DomGrp, 10, sum, # 10 year moving sum
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum20_DomGrp =
           rollapplyr(Expl_lag1_DomGrp, 20, sum, # 20 year moving sum
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum30_DomGrp =
           rollapplyr(Expl_lag1_DomGrp, 30, sum, # 30 year moving sum
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const5_DomType = 
           rollapplyr(const1_DomType, 5, sum, # 5 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const10_DomType = 
           rollapplyr(const1_DomType, 10, sum, # 10 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const20_DomType = 
           rollapplyr(const1_DomType, 20, sum, # 20 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const30_DomType = 
           rollapplyr(const1_DomType, 30, sum, # 30 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         # Free and imperial measures
         Expl_sum5_FreeImp = 
           rollapplyr(Expl_lag1_FreeImp, 5, sum, # 5 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum10_FreeImp = 
           rollapplyr(Expl_lag1_FreeImp, 10, sum, # 10 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum20_FreeImp = 
           rollapplyr(Expl_lag1_FreeImp, 20, sum, # 20 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum30_FreeImp = 
           rollapplyr(Expl_lag1_FreeImp, 30, sum, # 30 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const5_FreeImp = 
           rollapplyr(const1_FreeImp, 5, sum, # 5 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const10_FreeImp = 
           rollapplyr(const1_FreeImp, 10, sum, # 10 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const20_FreeImp = 
           rollapplyr(const1_FreeImp, 20, sum, # 20 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const30_FreeImp = 
           rollapplyr(const1_FreeImp, 30, sum, # 30 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         # Princely measures
         Expl_sum5_Princely = 
           rollapplyr(Expl_lag1_Princely, 5, sum, # 5 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum10_Princely = 
           rollapplyr(Expl_lag1_Princely, 10, sum, # 10 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum20_Princely = 
           rollapplyr(Expl_lag1_Princely, 20, sum, # 20 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum30_Princely = 
           rollapplyr(Expl_lag1_Princely, 30, sum, # 30 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const5_Princely = 
           rollapplyr(const1_Princely, 5, sum, # 5 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const10_Princely = 
           rollapplyr(const1_Princely, 10, sum, # 10 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const20_Princely = 
           rollapplyr(const1_Princely, 20, sum, # 20 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const30_Princely = 
           rollapplyr(const1_Princely, 30, sum, # 30 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         # now episcopal measures
         Expl_sum5_Episc = 
           rollapplyr(Expl_lag1_Episc, 5, sum, # 5 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum10_Episc = 
           rollapplyr(Expl_lag1_Episc, 10, sum, # 10 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum20_Episc = 
           rollapplyr(Expl_lag1_Episc, 20, sum, # 20 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         Expl_sum30_Episc = 
           rollapplyr(Expl_lag1_Episc, 30, sum, # 30 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const5_Episc = 
           rollapplyr(const1_Episc, 5, sum, # 5 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const10_Episc = 
           rollapplyr(const1_Episc, 10, sum, # 10 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const20_Episc = 
           rollapplyr(const1_Episc, 20, sum, # 20 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         const30_Episc = 
           rollapplyr(const1_Episc, 30, sum, # 30 year
                      na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  ungroup() %>%
  filter(!is.na(ObsID_Yr)) %>%
  arrange(Year, PlaceID)

###---###---###---###---###---###---###---###---###---###---
# 13. Alternative: Climate data ####
###---###---###---###---###---###---###---###---###---###---

# Since Anderson, Johnson, and Koyama (2017) found a relationship between
# temperature deviations in pogroms, I should check for this, too.
# I can't use climate deviation reprojections from Guiot, Corona, & 
# ESCARSEL Members (2010) - often used by economists, including AJK (2017) -
# because it doesn't cover the Alps - missing for lots of my cases.
# An alternative is the Community Climate Systems Model, 850-1850 (CCSM 4.0), 
# which is better data anyway (it's the standard for climate scientists).

# get the growing season precipitation and temperature data
#source(here::here("Diffusion","Climate_prep_annual.R"))
climate <- readRDS(here::here("in_data", "CCSM_climate_annual.rds"))

# lag climate measures - AJK (2017) use 1 year
climate <- climate %>%
  group_by(newID) %>%
  arrange(Year) %>% # order within city by year, so lags will be correct
  mutate(prc_lag1 = lag(prc), 
         temp_lag1 = lag(temp)) %>% # lag 1 year
  ungroup()

# merge in the temp deviations! 
# all.y=FALSE will drop off any climate obs without matching cities in this data
mydata_geo <- merge(mydata_geo, climate, by.x=c("Year","newID"), by.y=c("Year", "newID"), all.x=TRUE, all.y=FALSE)

###---###---###---###---###---###---###---###---###---###---
### 14. Other recoding before analysis ####----####----####----
###---###---###---###---###---###---###---###---###---###---

# mydata_geo_2 <- merge(mydata_terr, mydata_geo[,c(5,232:297)], by="ObsID_Yr", all.x=TRUE, all.y=TRUE)
# 
# mydata_geo <- mydata_geo_2

# contrast observations before and after 1418 through binary categorization as well as adding a time-decaying post-Council function

mydata_geo <- mydata_geo %>%
  mutate(postConstance = ifelse(Year>=1418, 1, 0),
         dist_Constance_log = ifelse(dist_Constance>0,
                                     log(dist_Constance),0))

mydata_geo <- mydata_geo %>%
  mutate(onroad = ifelse(dist2snap_roads<1, 1, 0))

# how many roads intersect each settlement?

# cleaned lines file
routes <- readOGR(here::here("GIS_data","Vectors", "medieval_routes"), "routes_all-segments_meters", GDAL1_integer64_policy = TRUE)
# drop columns not needed for this
routes <- routes[,c(1:8)]
# add a small buffer; using raster::buffer with dissolve=FALSE avoids an rgeos error when buffers overlap
routes_bf <- buffer(routes, width=5000, dissolve=FALSE) # 5km from each side
# points - city_coords_meters_sp added above, in Locations section

# get intersect
cities_bf_net <- intersect(city_coords_meters_sp, routes_bf)
summary(cities_bf_net) # 958 intersections

# extract dataframe, then create table of count of duplicates
setts_on_routes <- cities_bf_net %>%
  as.data.frame() %>%
  dplyr::select(PlaceName, PlaceID) %>%
  mutate_if(is.factor, as.character) %>%
  group_by(PlaceID) %>%
  summarize(RouteCount = n()) %>%
  arrange(PlaceID)

saveRDS(setts_on_routes, here::here("in_data", "setts_on_routes.rds"))
# or, grab the already-prepped file
setts_on_routes <- readRDS(here::here("in_data", "setts_on_routes.rds"))

# merge in the route intersection counts!
mydata_geo <- merge(mydata_geo, setts_on_routes, by.x=c("PlaceID"), by.y=c("PlaceID"), all.x=TRUE, all.y=FALSE)

# lots of NA - not on road - replace with 0
# also, to deal with skew, take log2 
# but FYI log2(1)=0, can use 0.1 for 0 values to keep dif btwn 0 and 1
mydata_geo <- mydata_geo %>%
  mutate_at(c("RouteCount"), ~ifelse(is.na(.),0,.)) %>%
  mutate(RouteLog2 = log2(RouteCount)) %>%
  mutate_at(c("RouteLog2"), ~ifelse(is.finite(.),.,log2(0.1)))

saveRDS(mydata_geo, here::here("in_data", "diffusion_final_rivroad.rds"))

###---###---###---###---###---###---###---###---###---###---
### 15. Footnotes and Revisions ####----####----####----
###---###---###---###---###---###---###---###---###---###---

# Check whether I can use centrality scores
## centrality scores from Becker, Hsiao, Pfaff, and Rubin (2024), shared privately pre-publication

city_net <- read_dta(here::here("in_data", "BPR_data.dta")) %>%
  dplyr::select(city, coordn, coorde, degree, closeness, betweenness, eigenvector) %>%
  filter(!is.na(degree)) #328 cities left

intersect(unique(mydata_geo$PlaceName), city_net$city) # 102 shared?

# check for spelling/language differences
city_matches <- mydata_geo %>%
  dplyr::select(PlaceName, XCOORD, YCOORD) %>% # coords will help judge match
  distinct() %>%
  mutate(city = PlaceName) %>% # make column to compare for merge
  merge(city_net, by="city", all=TRUE) # alpha order
# now just have to look manually for matches that should happen...
# to crossref EPSG3035(mine) with WGS84(BHPR), use epsg.io online
city_net$city[city_net$city=="Arnhem"] <- "Arnheim"
city_net$city[city_net$city=="Brussels"] <- "Bruessel"
city_net$city[city_net$city=="Cologne"] <- "Koeln"
city_net$city[city_net$city=="Culemburg"] <- "Culemborg"
city_net$city[city_net$city=="Daventer"] <- "Deventer"
city_net$city[city_net$city=="Eltville"] <- "Eltvil"
city_net$city[city_net$city=="Frankfurt am Main"] <- "Frankfurt"
city_net$city[city_net$city=="Freiburg"] <- "Freiburg N"
city_net$city[city_net$city=="Fribourg"] <- "Freiburg S"
city_net$city[city_net$city=="Geneva"] <- "Genf"
city_net$city[city_net$city=="Haguenau"] <- "Hagenau"
city_net$city[city_net$city=="Hanover"] <- "Hannover"
city_net$city[city_net$city=="Mulhouse"] <- "Muelhausen"
city_net$city[city_net$city=="Nijmegen"] <- "Nimwegen"
city_net$city[city_net$city=="Nuremberg"] <- "Nuernberg"
city_net$city[city_net$city=="Rottenburg am Neckar"] <- "Rottenburg"
city_net$city[city_net$city=="Selestat"] <- "Schlettstadt"
city_net$city[city_net$city=="St Gallen"] <- "St. Gallen"
city_net$city[city_net$city=="Sint Truiden"] <- "Sankt Truiden"
city_net$city[city_net$city=="Strasbourg"] <- "Strassburg"

intersect(unique(mydata_geo$PlaceName), city_net$city) # 120 shared; not enough

# How many cities had multiple expulsions? how many expulsions were not the first a city? (just the model_data sample)

cityIDnames <- mydata_geo %>% dplyr::select(PlaceID, PlaceName) %>% distinct()

cityExpltable <- model_data %>%
  dplyr::select(PlaceID, Year, Vrtrb) %>%
  join(cityIDnames, by = "PlaceID") %>%
  group_by(PlaceID, PlaceName) %>%
  arrange(Year) %>%
  mutate(VrtTot = cumsum(Vrtrb)) 

cityExplmultiple <- cityExpltable %>%
  summarize(CityTotal = max(VrtTot)) %>%
  filter(CityTotal>1)

# Should commercial development be binarized? Should Jewish infrastructure be or binarized (already 0-4)?

with(model_data, table(Mkt2, Vrtrb)) # four conceptually distinct ordinal categories, with reasonable distribution of expulsions  among them, but make a new binary anyway
mydata_geo$Mkt_bin <- ifelse(mydata_geo$Mkt2>0, 1, 0)

with(model_data, table(J_dev_sum, Vrtrb)) # scalar without much reason to divide between them, other than that there is a qualitative difference between 0, 1, and 4 - can do a specification subbing J_dev_bin

###---###---###---###---###---###---###---###---###---###---
### 16. Final prep and scaling for analysis ####----####----####----
###---###---###---###---###---###---###---###---###---###---

# or, grab the already-prepped file
mydata_geo <- readRDS(here::here("in_data", "diffusion_final_rivroad.rds"))


# export a table of all expulsions with a few details
# mydata_geo %>%
#   filter(Vrtrb2==1) %>% # include all attempted expulsions
#   dplyr::select(PlaceID, PlaceName, Year, Period, Vrtrb, Vrtrb2,
#                 DomTenure, DomParty, AuthTotal, BundTotPrev,
#                 Amt, Castellan, Rat, Schoeffen, Schlthss,
#                 Mkt2, Mint, Frgn,
#                 Wine, Cloth, Leather, Grain, Agric, Lumber, Mine, Salt,
#                 J_dev_sum, VfgTotal,
#                 RouteCount, Sector_f) %>%
#   write_excel_csv(here::here("Diffusion","out_files", "Expulsions_cleaned.csv"))

# rescale non-binary vars to play nice in Bayesian logistic regression

model_vars <- c("Vrtrb",
                "Expl_sum5","Expl_sum10",
                "dist1_euc","dist5_euc","dist10_euc",
                "dist1_rivroad","dist5_rivroad","dist10_rivroad",
                "Expl_lag1_DomGrp","Expl_sum5_DomGrp","Expl_sum10_DomGrp",
                "Expl_lag1_FreeImp","Expl_sum5_FreeImp","Expl_sum10_FreeImp",
                "Expl_lag1_Princely","Expl_sum5_Princely","Expl_sum10_Princely",
                "Expl_lag1_Episc","Expl_sum5_Episc","Expl_sum10_Episc",
                "const1_DomType","const5_DomType","const10_DomType",
                "const1_FreeImp","const5_FreeImp","const10_FreeImp",
                "const1_Princely","const5_Princely","const10_Princely",
                "const1_Episc","const5_Episc","const10_Episc",
                "postConstance","dist_Constance",
                "rugged","dist2snap_roads","dist2snap_rivroad","onroad","RouteCount","RouteLog2",
                "King", "Prince", "Lord", "minor", "imperial", "free", "city",
                "RelFund", "RelHouse", "Archbishop", "Bishop",
                "KingBin", "PrinceBin", "LordBin", "minorBin", "cityBin",
                "RelFundBin", "RelHouseBin", "ArchbishopBin", "BishopBin",
                "AuthMajor","AuthRel","AuthNob","AuthRelOnly","AuthNobOnly", "AuthFreeImp", "AuthEpisc",
                "AuthTotal","AuthShare",
                "DomChg","DomTenure",
                "DomChg_sum5","DomChg_sum10",
                "DomRts","DomSld","DomMtg","DomFf","DomCqr","DomFam","DomOth",
                "Residence",
                "StRt","Amt","Castellan","Rat","Mayor","Schoeffen","Schlthss","HighJust",
                "Bund", "BundTotPrev",
                "Diocese",
                "Gmnde","Ldrs","J_dev_sum", "J_dev_bin",
                "Vfg","VfgTotal","Vfg_sum5","VrtTotPrev",
                "Mkt2", "Mkt_bin", "Frgn","Transit","Tolls","Trade","Mint","Guilds",
                "temp_lag1", "Year","PlaceID","ObsID")

model_data_all <- mydata_geo[names(mydata_geo) %in% model_vars]
model_data_all <- model_data_all[complete.cases(model_data_all),]
# model_data_pre <- model_data_all[model_data_all$Year<1385,]
model_data <- model_data_all[model_data_all$Year>=1385,]

nrow(model_data) #36182
length(unique(model_data$PlaceID)) #578
table(model_data$Vrtrb) # 105

nrow(model_data_all) #73016
length(unique(model_data_all$PlaceID)) #808
table(model_data_all$Vrtrb) # 118

# add a factor that can be used instead of continuous year
model_data <- model_data %>%
  mutate(Period15yrs = cut(model_data$Year, 9, dig.lab=4),
         Period9yrs = cut(model_data$Year, 15, dig.lab=4),
         Period5yrs = cut(model_data$Year, 27, dig.lab=4))

# do this one separately because keeping only complete cases drops more when they need a 20 year run-up
model_data_20 <- mydata_geo[names(mydata_geo) %in% c(model_vars, c("Expl_sum20","Expl_sum20_Episc", "Expl_sum20_FreeImp", "dist20_rivroad", "const20_Episc", "const20_FreeImp"))]
model_data_20 <- model_data_20[complete.cases(model_data_20),]
model_data_20 <- model_data_20[model_data_20$Year>=1385,]
model_data_20 <- model_data_20 %>%
  mutate(Period15yrs = cut(model_data_20$Year, 9, dig.lab=4),
         Period9yrs = cut(model_data_20$Year, 15, dig.lab=4),
         Period5yrs = cut(model_data_20$Year, 27, dig.lab=4))


# transform continuous variables to work better in logistic regression by centering them on mean=0 (changing the scale) and then adjusting the range to sd= 0.5

scale_vars_1 <- c("rugged","dist2snap_roads","RouteCount","RouteLog2",
                "Expl_sum5","Expl_sum10",
                "dist1_euc","dist5_euc","dist10_euc",
                "dist1_rivroad","dist5_rivroad","dist10_rivroad",
                "Expl_lag1_DomGrp","Expl_sum5_DomGrp","Expl_sum10_DomGrp",
                "Expl_lag1_FreeImp","Expl_sum5_FreeImp","Expl_sum10_FreeImp",
                "Expl_lag1_Princely","Expl_sum5_Princely","Expl_sum10_Princely",
                "Expl_lag1_Episc","Expl_sum5_Episc","Expl_sum10_Episc",
                "const1_DomType","const5_DomType","const10_DomType",
                "const1_FreeImp","const5_FreeImp","const10_FreeImp",
                "const1_Princely","const5_Princely","const10_Princely",
                "const1_Episc","const5_Episc","const10_Episc",
                "dist_Constance",
                "King", "Prince", "Lord", "minor","city",
                "RelFund", "RelHouse", "Archbishop", "Bishop",
                "AuthMajor",
                "AuthTotal","DomTenure",
                "DomChg_sum5","DomChg_sum10",
                "BundTotPrev",
                "J_dev_sum",
                "VfgTotal", "Vfg_sum5","VrtTotPrev",
                "Mkt2", "temp_lag1",
                "Year")


model_data_scaled <- model_data %>% 
  mutate_at(vars(scale_vars_1), ~scale(.) %>% as.vector) %>%
  mutate_at(vars(scale_vars_1), ~.*0.5 %>% as.vector)

model_data_20_scaled <- model_data_20 %>% 
  mutate_at(vars(c(scale_vars_1,"Expl_sum20","Expl_sum20_Episc", "Expl_sum20_FreeImp", "dist20_rivroad", "const20_Episc", "const20_FreeImp")), ~scale(.) %>% as.vector) %>%
  mutate_at(vars(c(scale_vars_1,"Expl_sum20","Expl_sum20_Episc", "Expl_sum20_FreeImp", "dist20_rivroad", "const20_Episc", "const20_FreeImp")), ~.*0.5 %>% as.vector)

###---###---###---###---###---###---###---###---###---###---
# 17. Output for maps ####
###---###---###---###---###---###---###---###---###---###---

city_coords <- mydata_geo %>%
  arrange(PlaceID) %>%
  dplyr::select(PlaceID, XCOORD, YCOORD) %>%
  distinct()

# summarize: one obs per city

model_data_summ <- model_data %>%
  dplyr::select(PlaceID, Year, Vrtrb) %>%
  group_by(PlaceID) %>%
  summarise(nobs = n(),
            VrtrbTotal = sum(Vrtrb),
            VrtrbEver = ifelse(sum(Vrtrb)>0,1,0)) %>%
  left_join(city_coords, by="PlaceID")

# as spatial object
crs_EPSG3035 <- "+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs"

model_data_summ_sp <- model_data_summ
coordinates(model_data_summ_sp) <- ~ XCOORD + YCOORD
projection(model_data_summ_sp) <- crs_EPSG3035
writeOGR(model_data_sp, here::here("GIS_data","Vectors","GderJuden_annual"), "GderJuden_annual_pooled1385", driver="ESRI Shapefile")

# summarize in periods: four obs per city

model_data_pds <- model_data %>%
  dplyr::select(PlaceID, Year, Vrtrb) %>%
  mutate(newperiods = case_when(Year>=1385&Year<1415 ~ 1,
                                Year>=1415&Year<1450 ~ 2,
                                Year>=1450&Year<1485 ~ 3,
                                Year>=1485&Year<1520 ~ 4)) %>%
  group_by(PlaceID, newperiods) %>%
  summarise(nobs = n(),
            VrtrbTotal = sum(Vrtrb),
            VrtrbEver = ifelse(sum(Vrtrb)>0,1,0)) %>%
  left_join(city_coords, by="PlaceID")

model_data_pds_sp <- model_data_pds
coordinates(model_data_pds_sp) <- ~ XCOORD + YCOORD
projection(model_data_pds_sp) <- crs_EPSG3035
writeOGR(model_data_pds_sp, here::here("GIS_data","Vectors","GderJuden_annual"), "GderJuden_annual_quarters1385", driver="ESRI Shapefile")

# keep all years and covariates, to do maps and animations also using covariates

model_data_sp <- model_data %>%
  left_join(city_coords, by="PlaceID")
coordinates(model_data_sp) <- ~ XCOORD + YCOORD
projection(model_data_sp) <- crs_EPSG3035
writeOGR(model_data_sp, here::here("GIS_data","Vectors","GderJuden_annual"), "GderJuden_annual_from1385", driver="ESRI Shapefile") # shapefile column names must be 10 characters or less, so names will get truncated (sigh) but since I have the true names and generally plan to only access the .shp from here in R, I'm just going to deal and move on

###---###---###---###---###---###---###---###---###---###---
# 18. Create output report using RMarkdown ####
###---###---###---###---###---###---###---###---###---###---

#install.packages("rmarkdown")
#install.packages("knitr")
#install.packages("pander")
# installing/loading the pandoc package:
#if(!require(installr)) { install.packages("installr"); require(installr)} #load / install+load installr
# Installing pandoc
#install.pandoc()
# on a Mac
#export PATH=$PATH:/Applications/RStudio.app/Contents/MacOS/pandoc

# to run, need mydata_geo & its variants + Expl.matrix - loaded in Rmd

# analysis RMD
t.start <- Sys.time()
rmarkdown::render(here::here("Diffusion","Diffusion_Analysis.Rmd"))
Sys.time() - t.start
browseURL(paste("file://", file.path(getwd(),"Diffusion","Diffusion_Analysis.html"), sep = ""))

# autocorrelation RMD
rmarkdown::render(here::here("Diffusion","Diffusion_Autocorrelation.Rmd"))
browseURL(paste("file://", file.path(getwd(),"Diffusion","Diffusion_Autocorrelation.html"), sep = ""))
